Personal Assignment 2
| | Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | ... | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
| 1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | ... | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
| 2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | ... | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
| 3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | ... | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
| 4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | ... | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
| 5 | 2.0 | -0.425966 | 0.960523 | 1.141109 | -0.168252 | 0.420987 | -0.029728 | 0.476201 | 0.260314 | -0.568671 | ... | -0.208254 | -0.559825 | -0.026398 | -0.371427 | -0.232794 | 0.105915 | 0.253844 | 0.081080 | 3.67 | 0 |
6 rows × 31 columns
| | Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 284807.000000 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | ... | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 284807.000000 | 284807.000000 |
| mean | 94813.859575 | 1.168375e-15 | 3.416908e-16 | -1.379537e-15 | 2.074095e-15 | 9.604066e-16 | 1.487313e-15 | -5.556467e-16 | 1.213481e-16 | -2.406331e-15 | ... | 1.654067e-16 | -3.568593e-16 | 2.578648e-16 | 4.473266e-15 | 5.340915e-16 | 1.683437e-15 | -3.660091e-16 | -1.227390e-16 | 88.349619 | 0.001727 |
| std | 47488.145955 | 1.958696e+00 | 1.651309e+00 | 1.516255e+00 | 1.415869e+00 | 1.380247e+00 | 1.332271e+00 | 1.237094e+00 | 1.194353e+00 | 1.098632e+00 | ... | 7.345240e-01 | 7.257016e-01 | 6.244603e-01 | 6.056471e-01 | 5.212781e-01 | 4.822270e-01 | 4.036325e-01 | 3.300833e-01 | 250.120109 | 0.041527 |
| min | 0.000000 | -5.640751e+01 | -7.271573e+01 | -4.832559e+01 | -5.683171e+00 | -1.137433e+02 | -2.616051e+01 | -4.355724e+01 | -7.321672e+01 | -1.343407e+01 | ... | -3.483038e+01 | -1.093314e+01 | -4.480774e+01 | -2.836627e+00 | -1.029540e+01 | -2.604551e+00 | -2.256568e+01 | -1.543008e+01 | 0.000000 | 0.000000 |
| 25% | 54201.500000 | -9.203734e-01 | -5.985499e-01 | -8.903648e-01 | -8.486401e-01 | -6.915971e-01 | -7.682956e-01 | -5.540759e-01 | -2.086297e-01 | -6.430976e-01 | ... | -2.283949e-01 | -5.423504e-01 | -1.618463e-01 | -3.545861e-01 | -3.171451e-01 | -3.269839e-01 | -7.083953e-02 | -5.295979e-02 | 5.600000 | 0.000000 |
| 50% | 84692.000000 | 1.810880e-02 | 6.548556e-02 | 1.798463e-01 | -1.984653e-02 | -5.433583e-02 | -2.741871e-01 | 4.010308e-02 | 2.235804e-02 | -5.142873e-02 | ... | -2.945017e-02 | 6.781943e-03 | -1.119293e-02 | 4.097606e-02 | 1.659350e-02 | -5.213911e-02 | 1.342146e-03 | 1.124383e-02 | 22.000000 | 0.000000 |
| 75% | 139320.500000 | 1.315642e+00 | 8.037239e-01 | 1.027196e+00 | 7.433413e-01 | 6.119264e-01 | 3.985649e-01 | 5.704361e-01 | 3.273459e-01 | 5.971390e-01 | ... | 1.863772e-01 | 5.285536e-01 | 1.476421e-01 | 4.395266e-01 | 3.507156e-01 | 2.409522e-01 | 9.104512e-02 | 7.827995e-02 | 77.165000 | 0.000000 |
| max | 172792.000000 | 2.454930e+00 | 2.205773e+01 | 9.382558e+00 | 1.687534e+01 | 3.480167e+01 | 7.330163e+01 | 1.205895e+02 | 2.000721e+01 | 1.559499e+01 | ... | 2.720284e+01 | 1.050309e+01 | 2.252841e+01 | 4.584549e+00 | 7.519589e+00 | 3.517346e+00 | 3.161220e+01 | 3.384781e+01 | 25691.160000 | 1.000000 |
8 rows × 31 columns
# Show a correlation heatmap across every column of the card dataframe.
import seaborn as sns

plt.figure(figsize=(20, 10))
correlations = card.corr()
sns.heatmap(correlations, cmap="YlGnBu")
plt.show()

# 4 Scatterplot matrix
# 4: Scatterplot matrix of the sampled data, colored by fraud class.
import plotly.express as px
import plotly.io as pio

fig = px.scatter_matrix(
    my_sample,
    color="Class",
    title="Scatterplot Matrix of Time, Amount, Class, and Every 4th V Column",
)
# Shrink markers for the dense cloud; keep histograms on the diagonal and
# hide the mirrored upper half of the matrix.
fig.update_traces(marker=dict(size=2), selector=dict(diagonal='histogram'), showupperhalf=False)
fig.update_layout(font=dict(size=7, color='black'))
# Save an interactive copy alongside the notebook before displaying it.
pio.write_html(fig, file='scatterplot_matrix.html')
fig.show()

# Explanation of Commented Code
The portions of code for training the models are commented out because they take roughly nine hours to run. I have saved the results to a CSV and loaded them back in to make the boxplot. Additionally, two models have been saved and will be loaded in to complete the notebook.
#6 Perform classification using methods mentioned in word doc
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, RepeatedKFold
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from lightgbm import LGBMRegressor
import torch
# #I dont think sklearn supports gpu acceleration?
# #device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# def train_eval(X_train, y_train, X_test, y_test):
# regressors = {
# 'Linear Regression': LinearRegression(),
# 'Decision Tree': DecisionTreeRegressor(),
# 'Random Forest': RandomForestRegressor(),
# 'K-Nearest Neighbors': KNeighborsRegressor(),
# 'Bagging': BaggingRegressor(),
# 'Gradient Boosting': GradientBoostingRegressor(),
# 'LightGBM': LGBMRegressor(),
# 'Ridge': Ridge(),
# 'Lasso': Lasso()}
# results, names = [], []
# for name, method in regressors.items():
# cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1276)
# scores = -cross_val_score(method, X_train, y_train, scoring= 'neg_mean_absolute_error', cv=cv, n_jobs=-1)
# results.append(scores)
# names.append(name)
# return results, names
# results, model_names = train_eval(X_train, y_train, X_test, y_test)#show box plot for results
def box_results(the_results):
    """Display a box plot of the cross-validation scores for each regression model.

    Expects `the_results` in long format with 'variable' (model name) and
    'value' (score) columns, as produced by melting the saved results CSV.
    """
    plot_title = "Box Plot of Regression Models"
    box_fig = px.box(the_results, x="variable", y="value", title=plot_title)
    box_fig.show()
# Plot the cross-validation results loaded from the saved CSV.
box_results(regressmod)

# This is a little more formal of a way to confirm what I'm seeing in the
# boxplots. Actually it looks like I can't use Logistic Regression, so I'll
# take it out.
# Base (level-0) estimators for the ensemble comparison; a list literal is
# clearer than repeated append calls.
level0 = [
    ('bag', BaggingRegressor()),
    ('dt', DecisionTreeRegressor()),
    ('rf', RandomForestRegressor()),
    ('knn', KNeighborsRegressor()),
]
print(level0)
# Get the MSE for the test-set predictions and report it.
mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error: %.5f' % mse)
# 9: Visualize prediction quality — actual vs. predicted values.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, alpha=0.5, edgecolors='w', linewidths=0.25)
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('Actual vs Predicted')
# Light grid to make value comparison easier.
ax.grid(True, color='black', alpha=0.25, linewidth=0.25)
fig.tight_layout()
plt.show()